0.1 libraries

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.0     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.1.8
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(phyloseq)
library(ampvis2)

0.1.1 import

setwd("~/storage/pulkovo/d2")

map <- read_tsv("/home/arina/data-processing/lab/andronov-pulkovsk-visoti/metadata_Pulkovo.txt") %>% 
  mutate(SampleID = str_replace_all(SampleID, "-", ".")) %>% 
  column_to_rownames("SampleID") %>% 
  mutate(Repeat = paste0(`Geo-eng`, "_" , Sample)) %>% 
  mutate(Site = `Geo-eng`) %>% 
  select(-c(Sample, `Geo-eng`, replicate))
## Rows: 54 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): SampleID, Geo-eng
## dbl (9): Sample, replicate, Na-water, K-water, Cl, pH-salt, pH-water, C-tota...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
otu <- read_tsv("otu_table.txt") %>% 
  column_to_rownames("...1") %>% 
  as.matrix %>% 
  t()
## New names:
## Rows: 15650 Columns: 55
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "\t" chr
## (1): ...1 dbl (54): Andronov.SEQ130.1, Andronov.SEQ130.10, Andronov.SEQ130.11,
## Androno...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
taxa <- read_tsv("taxa.txt") %>% 
  column_to_rownames("...1") %>% 
  as.matrix()
## New names:
## Rows: 15650 Columns: 8
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "\t" chr
## (8): ...1, Kingdom, Phylum, Class, Order, Family, Genus, Species
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
rep <- Biostrings::readDNAStringSet("rep_seq.fasta")

ps <- phyloseq(otu_table(otu, taxa_are_rows=FALSE),
                tax_table(taxa),
               sample_data(map),
               refseq(rep))

# 
# saveRDS(ps.f, "ps.f")
# ps.f <- readRDS("ps.f")
# ps.f

0.1.2 functions

plot_rich_reads_samlenames_lm <- function(physeq, group = "Site", label = "Repeat"){
  rish <- estimate_richness(physeq, measures = "Observed")
  reads.sum <- as.data.frame(sample_sums(physeq))
  reads.summary <- cbind(rish, reads.sum)
  colnames(reads.summary) <- c("otus","reads")
  reads.summary["Repeat"] <-unlist(purrr::map(stringr::str_split(rownames(physeq@sam_data), "\\.", 2), function(x) x[[2]]))
  reads.summary["Site"] <- physeq@sam_data[[group]]
  library(ggrepel)
  require(ggforce)
  p1 <- ggplot(data=reads.summary) + 
    geom_point(aes(y=otus, x=log2(reads), color=Site),size=3) + 
    geom_text_repel(aes(y=otus, x=log2(reads), label=paste0(Repeat))) + 
    theme_bw() +
    geom_smooth(aes(y=otus, x=log2(reads), fill=Site, color=Site),method=lm, se=FALSE, ymin = 1) + 
    scale_x_continuous(sec.axis = sec_axis(sec.axis ~ 2**.)) 

  return(p1)
}

beta_custom_norm_NMDS_elli_w <- function(ps, seed = 7888, normtype="vst", Color="What", Group="Repeat"){
  require(phyloseq)
  require(ggplot2)
  require(ggpubr)
  library(ggforce)
  
  
  ordination.b <- ordinate(ps, "NMDS", "bray")
  mds <- as.data.frame(ordination.b$points)
  p  <-  plot_ordination(ps,
                         ordination.b,
                         type="sample",
                         color = Color,
                         title="NMDS - Bray-Curtis",
                         # title=NULL,
                         axes = c(1,2) ) + 
    theme_bw() + 
    theme(text = element_text(size = 10)) + 
    geom_point(size = 3) +
    annotate("text",
    x=min(mds$MDS1) + abs(min(mds$MDS1))/7,
    y=max(mds$MDS2),
    label=paste0("Stress -- ", round(ordination.b$stress, 3))) +
    geom_mark_ellipse(aes_string(group = Group, label = Group),
                      label.fontsize = 10,
                      label.buffer = unit(2, "mm"),
                      label.minwidth = unit(5, "mm"),
                      con.cap = unit(0.1, "mm"),
                      con.colour='gray') +
    theme(legend.position = "none") +
    ggplot2::scale_fill_viridis_c(option = "H")
  
  return(p)
}

plot_alpha_w_toc <- function(ps, group, metric) {
  
  require(phyloseq)
  require(ggplot2)
  
  ps_a <-c
  
  er <- estimate_richness(ps_a)
  df_er <- cbind(ps_a@sam_data, er)
  df_er <- df_er %>% select(c(group, metric))
  stat.test <- aov(as.formula(paste0(metric, "~", group)), data = df_er) %>%
    rstatix::tukey_hsd()
  y <-  seq(max(er[[metric]]), length=length(stat.test$p.adj), by=max(er[[metric]]/20))

  plot_richness(ps_a, x=group, measures=metric) + 
    geom_boxplot() +
    geom_point(size=1.2, alpha=0.3) +
    ggpubr::stat_pvalue_manual(
      stat.test, 
      label = "p.adj.signif", 
      y.position = y) +
    theme_light() + 
    scale_color_brewer(palette="Dark2") +
    theme(axis.text.x = element_text(angle = 90),
          axis.title.x=element_blank()) +
    labs(y=paste(metric, "index")) 
}

phyloseq_to_ampvis2 <- function(physeq) {
  #check object for class
  if(!any(class(physeq) %in% "phyloseq"))
    stop("physeq object must be of class \"phyloseq\"", call. = FALSE)
  
  #ampvis2 requires taxonomy and abundance table, phyloseq checks for the latter
  if(is.null(physeq@tax_table))
    stop("No taxonomy found in the phyloseq object and is required for ampvis2", call. = FALSE)
  
  #OTUs must be in rows, not columns
  if(phyloseq::taxa_are_rows(physeq))
    abund <- as.data.frame(phyloseq::otu_table(physeq)@.Data)
  else
    abund <- as.data.frame(t(phyloseq::otu_table(physeq)@.Data))
  
  #tax_table is assumed to have OTUs in rows too
  tax <- phyloseq::tax_table(physeq)@.Data
  
  #merge by rownames (OTUs)
  otutable <- merge(
    abund,
    tax,
    by = 0,
    all.x = TRUE,
    all.y = FALSE,
    sort = FALSE
  )
  colnames(otutable)[1] <- "OTU"
  
  #extract sample_data (metadata)
  if(!is.null(physeq@sam_data)) {
    metadata <- data.frame(
      phyloseq::sample_data(physeq),
      row.names = phyloseq::sample_names(physeq), 
      stringsAsFactors = FALSE, 
      check.names = FALSE
    )
    
    #check if any columns match exactly with rownames
    #if none matched assume row names are sample identifiers
    samplesCol <- unlist(lapply(metadata, function(x) {
      identical(x, rownames(metadata))}))
    
    if(any(samplesCol)) {
      #error if a column matched and it's not the first
      if(!samplesCol[[1]])
        stop("Sample ID's must be in the first column in the sample metadata, please reorder", call. = FALSE)
    } else {
      #assume rownames are sample identifiers, merge at the end with name "SampleID"
      if(any(colnames(metadata) %in% "SampleID"))
        stop("A column in the sample metadata is already named \"SampleID\" but does not seem to contain sample ID's", call. = FALSE)
      metadata$SampleID <- rownames(metadata)
      
      #reorder columns so SampleID is the first
      metadata <- metadata[, c(which(colnames(metadata) %in% "SampleID"), 1:(ncol(metadata)-1L)), drop = FALSE]
    }
  } else
    metadata <- NULL
  
  #extract phylogenetic tree, assumed to be of class "phylo"
  if(!is.null(physeq@phy_tree)) {
    tree <- phyloseq::phy_tree(physeq)
  } else
    tree <- NULL
  
  #extract OTU DNA sequences, assumed to be of class "XStringSet"
  if(!is.null(physeq@refseq)) {
    #convert XStringSet to DNAbin using a temporary file (easiest)
    fastaTempFile <- tempfile(pattern = "ampvis2_", fileext = ".fa")
    Biostrings::writeXStringSet(physeq@refseq, filepath = fastaTempFile)
  } else
    fastaTempFile <- NULL
  
  #load as normally with amp_load
  ampvis2::amp_load(
    otutable = otutable,
    metadata = metadata,
    tree = tree,
    fasta = fastaTempFile
  )
}

filter_chrmitnas <- function(physeq){
  ps_object <- physeq 
  ps_object <- subset_taxa(ps_object, Phylum != "NA")
  
  # ps_object@tax_table[is.na(ps_object@tax_table)] <- TRUEps@tax_table@.Data
  ps_object <- subset_taxa(ps_object,
                           !(Family  == "Mitochondria" |
                               Class   == "Chloroplast" |
                               Order   == "Chloroplast"))
  # ps_object@tax_table <- dplyr::na_if(ps_object@tax_table, TRUE)
  ps.f <- ps_object
  
  out_old <- capture.output(physeq)[4] %>% 
    str_extract(regex("([1-9]).*(?= by)"))
  out_new <- capture.output(ps.f)[4] %>% 
    str_extract(regex("([1-9]).*(?= by)"))
  message(paste0("old = ", out_old))
  message(paste0("filtered = ", out_new))
  
  return(ps.f)
  
}

lsf.str()
## beta_custom_norm_NMDS_elli_w : function (ps, seed = 7888, normtype = "vst", Color = "What", Group = "Repeat")  
## filter_chrmitnas : function (physeq)  
## phyloseq_to_ampvis2 : function (physeq)  
## plot_alpha_w_toc : function (ps, group, metric)  
## plot_rich_reads_samlenames_lm : function (physeq, group = "Site", label = "Repeat")
ps.f <- filter_chrmitnas(ps)
## old = 15650 taxa
## filtered = 10719 taxa
plot_rich_reads_samlenames_lm(ps.f, group = "Site", label = "Sample")  
## Loading required package: ggforce
## `geom_smooth()` using formula = 'y ~ x'
## Warning: ggrepel: 46 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

ps.ff <- prune_samples(!sample_names(ps.f) %in% paste0("Andronov.SEQ130.",c(34, 5, 15)), ps.f)
ps.ff  <- prune_taxa(taxa_sums(ps.ff) > 0, ps.ff)


plot_rich_reads_samlenames_lm(ps.ff, group = "Site", label = "Repeat") 
## `geom_smooth()` using formula = 'y ~ x'
## Warning: ggrepel: 11 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

beta_custom_norm_NMDS_elli_w(ps.ff, Color = "Site", Group = "Site") 
## Loading required package: ggpubr
## Square root transformation
## Wisconsin double standardization
## Run 0 stress 0.040883 
## Run 1 stress 0.04088291 
## ... New best solution
## ... Procrustes: rmse 0.0001136501  max resid 0.0006866252 
## ... Similar to previous best
## Run 2 stress 0.04088302 
## ... Procrustes: rmse 0.0001093968  max resid 0.0006465295 
## ... Similar to previous best
## Run 3 stress 0.04088292 
## ... Procrustes: rmse 0.00007347523  max resid 0.0004436709 
## ... Similar to previous best
## Run 4 stress 0.04088297 
## ... Procrustes: rmse 0.00009644391  max resid 0.000579812 
## ... Similar to previous best
## Run 5 stress 0.040883 
## ... Procrustes: rmse 0.00005250913  max resid 0.0002992113 
## ... Similar to previous best
## Run 6 stress 0.04088288 
## ... New best solution
## ... Procrustes: rmse 0.00005536902  max resid 0.0003120867 
## ... Similar to previous best
## Run 7 stress 0.04088294 
## ... Procrustes: rmse 0.00003866784  max resid 0.0002017634 
## ... Similar to previous best
## Run 8 stress 0.04088305 
## ... Procrustes: rmse 0.00007407182  max resid 0.0004018779 
## ... Similar to previous best
## Run 9 stress 0.04088291 
## ... Procrustes: rmse 0.00001937751  max resid 0.00009890726 
## ... Similar to previous best
## Run 10 stress 0.0408829 
## ... Procrustes: rmse 0.0000134097  max resid 0.0000675259 
## ... Similar to previous best
## Run 11 stress 0.04088293 
## ... Procrustes: rmse 0.00003389185  max resid 0.0001990864 
## ... Similar to previous best
## Run 12 stress 0.04088291 
## ... Procrustes: rmse 0.00002012836  max resid 0.0001021024 
## ... Similar to previous best
## Run 13 stress 0.040883 
## ... Procrustes: rmse 0.00006382984  max resid 0.0003486531 
## ... Similar to previous best
## Run 14 stress 0.04088291 
## ... Procrustes: rmse 0.00001481653  max resid 0.00006931742 
## ... Similar to previous best
## Run 15 stress 0.04088301 
## ... Procrustes: rmse 0.00006630298  max resid 0.0003419541 
## ... Similar to previous best
## Run 16 stress 0.04088302 
## ... Procrustes: rmse 0.00005111931  max resid 0.0002736245 
## ... Similar to previous best
## Run 17 stress 0.04088287 
## ... New best solution
## ... Procrustes: rmse 0.00002953964  max resid 0.0001524292 
## ... Similar to previous best
## Run 18 stress 0.04088306 
## ... Procrustes: rmse 0.0001135854  max resid 0.000592004 
## ... Similar to previous best
## Run 19 stress 0.04088294 
## ... Procrustes: rmse 0.00005696751  max resid 0.0002208853 
## ... Similar to previous best
## Run 20 stress 0.04088291 
## ... Procrustes: rmse 0.00004956004  max resid 0.0002547419 
## ... Similar to previous best
## *** Best solution repeated 4 times
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation ideoms with `aes()`

ps.r <- rarefy_even_depth(ps.ff,rngseed = 1221)
## `set.seed(1221)` was used to initialize repeatable random subsampling.
## Please record this for your records so others can reproduce.
## Try `set.seed(1221); .Random.seed` for the full vector
## ...
## 421OTUs were removed because they are no longer 
## present in any sample after random subsampling
## ...
ps.r
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 10275 taxa and 51 samples ]
## sample_data() Sample Data:       [ 51 samples by 9 sample variables ]
## tax_table()   Taxonomy Table:    [ 10275 taxa by 7 taxonomic ranks ]
## refseq()      DNAStringSet:      [ 10275 reference sequences ]
beta_custom_norm_NMDS_elli_w(ps.r, Color = "Site", Group = "Site") 
## Square root transformation
## Wisconsin double standardization
## Run 0 stress 0.03281294 
## Run 1 stress 0.03281294 
## ... New best solution
## ... Procrustes: rmse 0.00003300971  max resid 0.0001301202 
## ... Similar to previous best
## Run 2 stress 0.03281293 
## ... New best solution
## ... Procrustes: rmse 0.000001794148  max resid 0.00000636754 
## ... Similar to previous best
## Run 3 stress 0.03281305 
## ... Procrustes: rmse 0.00006464719  max resid 0.0002555404 
## ... Similar to previous best
## Run 4 stress 0.03281293 
## ... New best solution
## ... Procrustes: rmse 0.000003331408  max resid 0.00001223961 
## ... Similar to previous best
## Run 5 stress 0.03281296 
## ... Procrustes: rmse 0.00002995194  max resid 0.0001188467 
## ... Similar to previous best
## Run 6 stress 0.03281301 
## ... Procrustes: rmse 0.00005133915  max resid 0.0002033569 
## ... Similar to previous best
## Run 7 stress 0.032813 
## ... Procrustes: rmse 0.00005186784  max resid 0.0002055802 
## ... Similar to previous best
## Run 8 stress 0.03281301 
## ... Procrustes: rmse 0.00005504832  max resid 0.0002181154 
## ... Similar to previous best
## Run 9 stress 0.03281294 
## ... Procrustes: rmse 0.00001107934  max resid 0.00004250849 
## ... Similar to previous best
## Run 10 stress 0.03281294 
## ... Procrustes: rmse 0.00000897434  max resid 0.00002738668 
## ... Similar to previous best
## Run 11 stress 0.03281301 
## ... Procrustes: rmse 0.00004782254  max resid 0.0001885485 
## ... Similar to previous best
## Run 12 stress 0.03281304 
## ... Procrustes: rmse 0.00006446968  max resid 0.0002554987 
## ... Similar to previous best
## Run 13 stress 0.03281293 
## ... New best solution
## ... Procrustes: rmse 0.00001661198  max resid 0.00006464267 
## ... Similar to previous best
## Run 14 stress 0.03281305 
## ... Procrustes: rmse 0.00008591437  max resid 0.0003396772 
## ... Similar to previous best
## Run 15 stress 0.03281303 
## ... Procrustes: rmse 0.00007177757  max resid 0.000282788 
## ... Similar to previous best
## Run 16 stress 0.03281304 
## ... Procrustes: rmse 0.0000820854  max resid 0.0003245479 
## ... Similar to previous best
## Run 17 stress 0.03281295 
## ... Procrustes: rmse 0.00003182524  max resid 0.0001257578 
## ... Similar to previous best
## Run 18 stress 0.03281302 
## ... Procrustes: rmse 0.00007563761  max resid 0.0002991288 
## ... Similar to previous best
## Run 19 stress 0.03281297 
## ... Procrustes: rmse 0.00004755598  max resid 0.0001879429 
## ... Similar to previous best
## Run 20 stress 0.03281294 
## ... Procrustes: rmse 0.00002613964  max resid 0.0001031611 
## ... Similar to previous best
## *** Best solution repeated 8 times

ps.merged <- merge_samples(ps.ff, "Repeat")
## Warning in asMethod(object): NAs introduced by coercion

## Warning in asMethod(object): NAs introduced by coercion
d_meta <- ps.merged@sam_data %>% 
  data.frame() %>% 
  select(c("Na.water", "K.water", "Cl", "pH.salt", "C.total", "Electro")) %>% 
  scale() %>% 
  as.data.frame()

d_otu <- phyloseq::distance(ps.merged, "bray")

dec <- sort(c("Na.water", "K.water", "Cl", "pH.salt", "C.total", "Electro"), decreasing = TRUE )
sort <- sort(c("Na.water", "K.water", "Cl", "pH.salt", "C.total", "Electro"), decreasing = FALSE )

f_dec <- as.formula(paste0("d_otu ~", noquote(paste(dec, collapse=" + "))))
f_sort <- as.formula(paste0("d_otu ~", noquote(paste(sort, collapse=" + "))))


list(
  order_forward = vegan::adonis2(f_dec, data = d_meta),
  order_backward = vegan::adonis2(f_sort, data = d_meta)
)
## $order_forward
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## vegan::adonis2(formula = f_dec, data = d_meta)
##          Df SumOfSqs      R2      F Pr(>F)    
## pH.salt   1  0.77383 0.46702 9.8349  0.001 ***
## Na.water  1  0.34096 0.20578 4.3334  0.022 *  
## K.water   1  0.13181 0.07955 1.6752  0.271    
## Electro   1  0.07642 0.04612 0.9713  0.495    
## Cl        1  0.11950 0.07212 1.5187  0.259    
## C.total   1  0.05705 0.03443 0.7251  0.616    
## Residual  2  0.15736 0.09497                  
## Total     8  1.65694 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## $order_backward
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## vegan::adonis2(formula = f_sort, data = d_meta)
##          Df SumOfSqs      R2      F Pr(>F)    
## C.total   1  0.69264 0.41802 8.8030  0.001 ***
## Cl        1  0.31875 0.19237 4.0511  0.020 *  
## Electro   1  0.13994 0.08446 1.7785  0.202    
## K.water   1  0.16173 0.09761 2.0554  0.151    
## Na.water  1  0.07581 0.04575 0.9634  0.500    
## pH.salt   1  0.11072 0.06682 1.4072  0.293    
## Residual  2  0.15736 0.09497                  
## Total     8  1.65694 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ps.ff@sam_data %>% 
  data.frame() %>%
  # rownames_to_column("shit") %>% 
  # column_to_rownames("Name") %>% 
  select(which(sapply(., is.numeric)), Site) %>% 
  group_by(Site) %>% 
  summarise(across(everything(), mean),
            .groups = 'drop') %>% 
  column_to_rownames("Site") %>% 
  scale() %>% 
  heatmaply::ggheatmap()
## Registered S3 method overwritten by 'dendextend':
##   method     from 
##   rev.hclust vegan
## Registered S3 method overwritten by 'seriation':
##   method         from 
##   reorder.hclust vegan

library(ggrepel)

ps.merged <- phyloseq::merge_samples(ps.ff, "Repeat")
## Warning in asMethod(object): NAs introduced by coercion

## Warning in asMethod(object): NAs introduced by coercion
physeq <- ps.ff


veganifyOTU <- function(physeq){
  require(phyloseq)
  if(taxa_are_rows(physeq)){physeq <- t(physeq)}
  return(as(otu_table(physeq), "matrix"))
}

otus.ps.vegan <- veganifyOTU(physeq)
rownames(otus.ps.vegan) <- ps.ff@sam_data$Repeat
metadata <- physeq@sam_data %>% 
  data.frame() %>% 
  select(c("Na.water", "K.water", "Cl", "pH.salt", "C.total", "Electro"))

dec <- sort(c("Na.water", "K.water", "Cl", "pH.salt", "C.total", "Electro"), decreasing = TRUE )
sort <- sort(c("Na.water", "K.water", "Cl", "pH.salt", "C.total", "Electro"), decreasing = FALSE )

f_dec <- as.formula(paste0("otus.ps.vegan ~", noquote(paste(dec, collapse=" + "))))
f_sort <- as.formula(paste0("otus.ps.vegan ~", noquote(paste(sort, collapse=" + "))))


cca_dec <- vegan::cca(f_dec, data=metadata)
cca_sort <- vegan::cca(f_sort, data=metadata)

kate.ggcca.sites <- function(vare.cca){

library(tidyverse)
biplot <- as.data.frame(vare.cca$CCA$biplot)
wa <- as.data.frame(vare.cca$CCA$wa)

biplot <- rownames_to_column(biplot, "Label") %>% 
  add_column(Score = rep("biplot", length(rownames(biplot))))
wa <- rownames_to_column(wa, "Label") %>% 
  add_column(Score = rep("sites", length(rownames(wa))))
fdat_amazing <- rbind(biplot, wa) 
fdat_amazing <- fdat_amazing %>%
  mutate(label_size = ifelse(Score == "biplot", 4.5, 3))
 
p <- ggplot(fdat_amazing %>% filter(Score %in% c("sites","biplot"))) + 
  # geom_abline(intercept=seq(-100, 100, 25), slope=1, colour="grey", alpha=0.5) +
  # geom_abline(intercept=seq(-100, 100, 25), slope=-1, colour="grey", alpha=0.5) +
  geom_vline(xintercept = 0, size = 0.75, color = "#737373", alpha=0.5) +
  geom_hline(yintercept = 0, size = 0.75, color = "#737373", alpha=0.5) +
  geom_point(data = fdat_amazing %>% dplyr::filter(Score == "sites"), mapping = aes(x=CCA1, y=CCA2, colour = factor(Score))) + 
  geom_segment(data = fdat_amazing %>% dplyr::filter(Score == "biplot"), aes(x = 0, xend = CCA1, y = 0, yend = CCA2),
               size = 0.6, alpha=0.8, color = "red",arrow = arrow(angle = 3)) +
  geom_text_repel(aes(x=CCA1, y=CCA2, label= Label),size=fdat_amazing$label_size) + 
  xlab(paste0(round(vare.cca$CCA$eig[1], 3)*100, "% CCA1")) +
  ylab(paste0(round(vare.cca$CCA$eig[2], 3)*100, "% CCA2")) +
  grids(linetype = "dashed") +
  theme(legend.position = "none", panel.background = element_rect(fill = "white", colour = "grey50"))
  return(p)
}

cca.sites <- kate.ggcca.sites(cca_dec)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
cca.sites
## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

vegan::vif.cca(cca_dec)
##    pH.salt   Na.water    K.water    Electro         Cl    C.total 
##  12.164455  27.848814   1.678713 191.952968 215.325139  30.290554
library(ggrepel)

ps.fff <-  phyloseq::filter_taxa(ps.ff, function(x) sum(x > 3) > (0.1*length(x)), TRUE)
# ps.varstab <- ps_vst(ps.fff, "type")
ps.varstab <- ps.ff
# ps.varstab.merged <- phyloseq::merge_samples(ps.varstab, "Repeat")
# ps.varstab.merged.family <- tax_glom(ps.fff, taxrank="Family")

physeq <- ps.fff
ps.varstab <- physeq
veganifyOTU <- function(physeq){
  require(phyloseq)
  if(taxa_are_rows(physeq)){physeq <- t(physeq)}
  return(as(otu_table(physeq), "matrix"))
}

otus.ps.vegan <- veganifyOTU(physeq)
metadata <- as(sample_data(physeq), "data.frame") 
cca_w_varstab_asv <- vegan::cca(f_dec,  data=metadata)

wa <- as.data.frame(cca_w_varstab_asv$CCA$v) %>% 
  rownames_to_column("ID")

taxa.pruned <- as.data.frame(ps.varstab@tax_table@.Data) %>% 
  rownames_to_column("ID")

taxa.pruned <- taxa.pruned %>%  
  mutate_all(as.character)

# old.tips <- tree$tip.label
# matches <- regmatches(tree$tip.label, gregexpr("[[:digit:]]+", tree$tip.label))
# taxa.pruned$number <- as.numeric(unlist(matches))
# Shitty taxa formating part

taxa.pruned$taxa <- ifelse(is.na(taxa.pruned$Genus),
                            ifelse(is.na(taxa.pruned$Family),
                            ifelse(is.na(taxa.pruned$Order), 
                            ifelse(is.na(taxa.pruned$Class), taxa.pruned$Phylum, taxa.pruned$Class) , taxa.pruned$Order), taxa.pruned$Family), taxa.pruned$Genus)
taxa.pruned[taxa.pruned == "Burkholderia-Caballeronia-Paraburkholderia"] <- "Burkholderia"
taxa.pruned[taxa.pruned == "Allorhizobium-Neorhizobium-Pararhizobium-Rhizobium"] <- "Pararhizobium"

taxa.pruned$taxa2 <- ifelse(is.na(taxa.pruned$Species),
                            with(taxa.pruned, paste0(taxa)),
                            with(taxa.pruned, paste0(taxa, " ", Species )))
taxa.pruned$phylum <- ifelse(taxa.pruned$Phylum == "Proteobacteria", with(taxa.pruned, paste0(taxa.pruned$Class)), with(taxa.pruned, paste0(taxa.pruned$Phylum)))
taxa.pruned$Label <- paste0(taxa.pruned$ID, "_", taxa.pruned$taxa2)

wa <- full_join(wa, taxa.pruned, by="ID") %>% 
  mutate(Score = "sites")

#For the top 50 most abundant taxa, skip if use des res
head_asv <- ps.varstab@otu_table@.Data %>% 
  data.frame %>% 
  colSums() %>% 
  sort(decreasing = TRUE) %>% 
  head(n=100) %>% 
  names()
  

wa <- filter(wa, ID %in% head_asv)
# maybe only different by des? Picture will be nice.
# wa <- filter(wa, ID %in% row.des)
# wa <- filter(wa, ID %in% row.des.so)

biplot <- as.data.frame(cca_w_varstab_asv$CCA$biplot)
biplot <- rownames_to_column(biplot, "Label") %>% 
  add_column(Score = rep("biplot", length(rownames(biplot))))

fdat_amazing <- plyr::rbind.fill(biplot, wa) %>% 
  
  mutate(Label = as.factor(Label))
fdat_amazing <- fdat_amazing %>%
  mutate(label_size = ifelse(Score == "biplot", 4.5, 3))

p.cca.species <- ggplot(fdat_amazing %>% filter(Score %in% c("sites","biplot"))) + 
  geom_point(data = fdat_amazing %>% dplyr::filter(Score == "sites"), mapping = aes(x=CCA1, y=CCA2)) + 
  geom_segment(data = fdat_amazing %>% dplyr::filter(Score == "biplot"), aes(x = 0, xend = CCA1, y = 0, yend = CCA2), alpha=0.8, color = "red", arrow = arrow(angle = 3)) +
  geom_text_repel(aes(x=CCA1, y=CCA2, label= Label, colour = phylum), size=fdat_amazing$label_size) + 
  xlab(paste0(round(cca_w_varstab_asv$CCA$eig[1], 3)*100, "% CCA1")) +
  ylab(paste0(round(cca_w_varstab_asv$CCA$eig[2], 3)*100, "% CCA2")) +
  grids(linetype = "dashed") +
  geom_vline(xintercept = 0, size = 0.75, color = "#737373", alpha=0.5) +
  geom_hline(yintercept = 0, size = 0.75, color = "#737373", alpha=0.5) +
  theme(legend.position = "top", panel.background = element_rect(fill = "white", colour = "grey50"))


p.cca.species
## Warning: ggrepel: 13 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

amp <- phyloseq_to_ampvis2(ps.ff)

amp_heatmap(amp,
            tax_show = 22,
            group_by = "Site",
            tax_aggregate = "Phylum",
            tax_class = "Proteobacteria",
            normalise=TRUE,
            showRemainingTaxa = TRUE)

ps.inall <- phyloseq::filter_taxa(ps.ff, function(x) sum(x > 10) > (0.1*length(x)), TRUE)

amp.inall <- phyloseq_to_ampvis2(ps.inall)
amp_heatmap(amp.inall,
            tax_show = 40,
            group_by = "Site",
            tax_aggregate = "OTU",
            tax_add = "Genus",
            normalise=TRUE,
            showRemainingTaxa = TRUE)
## Warning: Transformation introduced infinite values in discrete y-axis

ps.exl  <- phyloseq::filter_taxa(ps.ff, function(x) sum(x > 10) < (0.1*length(x)), TRUE)

ps.per  <-  phyloseq::transform_sample_counts(ps.ff, function(x) x / sum(x) * 100) 
ps.exl.taxa <- taxa_names(ps.exl)
ps.per.exl <- prune_taxa(ps.exl.taxa, ps.per)
amp.exl.r <- phyloseq_to_ampvis2(ps.per.exl)

amp_heatmap(amp.exl.r, tax_show = 60, 
      group_by = "Site", 
      tax_aggregate = "OTU",
      tax_add = "Genus", 
      round =  2, 
      normalise=FALSE, 
      showRemainingTaxa = FALSE)
## Warning: Transformation introduced infinite values in discrete y-axis

# ancom_da_glob <- ANCOMBC::ancombc(phyloseq = ps.inall,
#                          formula = "Site",
#                          p_adj_method = "fdr",
#                          lib_cut = 1000,
#                          group = "Site",
#                          struc_zero = TRUE,
#                          neg_lb = TRUE,
#                          tol = 1e-5,
#                          max_iter = 100,
#                          conserve = TRUE,
#                          alpha = 0.05,
#                          global = TRUE)

set.seed(123)
ancom <-  ANCOMBC::ancombc2(data = ps.inall, 
  tax_level = NULL,
  fix_formula = "Site + Electro + pH.salt",
  rand_formula = NULL,
  p_adj_method = "fdr",
  pseudo = 0, 
  pseudo_sens = TRUE,
  prv_cut = 0.10,
  lib_cut = 1000,
  s0_perc = 0.05,
  group = "Site",
  struc_zero = TRUE,
  neg_lb = TRUE,
  alpha = 0.05,
  n_cl = 10, 
  verbose = TRUE,
  global = TRUE, 
  pairwise = TRUE, 
  dunnet = TRUE, 
  trend = FALSE,
  iter_control = list(tol = 1e-2, max_iter = 20, verbose = TRUE),
  em_control = list(tol = 1e-5, max_iter = 100),
  lme_control = lme4::lmerControl(),
  mdfdr_control = list(fwer_ctrl_method = "holm", B = 100),
  trend_control = list(contrast = list(matrix(c(1, 0, -1, 1),
    nrow = 2,
    byrow = TRUE),
    matrix(c(-1, 0, 1, -1),
    nrow = 2,
    byrow = TRUE)),
   node = list(2, 2),
   solver = "ECOS",
  B = 100)
  )
## Registered S3 methods overwritten by 'proxy':
##   method               from    
##   print.registry_field registry
##   print.registry_entry registry
## `tax_level` is not speficified 
## No agglomeration will be performed
## Otherwise, please speficy `tax_level` by one of the following: 
## Kingdom, Phylum, Class, Order, Family, Genus, Species
## Obtaining initial estimates ...
## ML iteration = 1, epsilon = 1.3
## ML iteration = 2, epsilon = 8.3e-14
## Estimating sample-specific biases ...
## ANCOM-BC2 primary results ...
## The sensitivity analysis for the pseudo-count addition ...
## ANCOM-BC2 global test ...
## ANCOM-BC2 pairwise directional test ...
## ANCOM-BC2 Dunnet's type of test ...
samp_frac = ancom$samp_frac
# Replace NA with 0
samp_frac[is.na(samp_frac)] = 0 
# Add pesudo-count (1) to avoid taking the log of 0
log_obs_abn = log(ancom$feature_table + 1)
# Adjust the log observed abundances
log_corr_abn = t(t(log_obs_abn) - samp_frac)
# Show the first 6 samples
round(log_corr_abn[, 1:6], 2) %>% 
  DT::datatable(caption = "Bias-corrected log observed abundances")
ancom$res_pair %>% 
  DT::datatable(caption = "Bias-corrected log observed abundances")
sum <- log_corr_abn %>% 
  as.data.frame() %>% 
  rowSums() 

log_corr_abn %>% 
  t() %>% 
  as.data.frame() %>% 
  length()
## [1] 532
ps.an <- prune_taxa(taxa_names(ps.ff) %in% rownames(log_corr_abn), ps.ff)
otu_table(ps.an) <- otu_table(t(log_corr_abn), taxa_are_rows = FALSE)
ps.an <- merge_samples(ps.an, "Site", fun = mean)
## Warning in asMethod(object): NAs introduced by coercion

## Warning in asMethod(object): NAs introduced by coercion
mean.d <- ps.an@otu_table %>% 
  t() %>% 
  data.frame() %>% 
  mutate(
    lfc_Sitehigh_road = (Ashan + high_road)/2,
    lfc_SiteMiddle = (Ashan + Middle)/2,
    lfc_SiteMiddle_Sitehigh_road = (Middle + high_road)/2
  ) %>% 
rownames_to_column("taxon") %>%  
  select(c("taxon","lfc_Sitehigh_road", "lfc_SiteMiddle", "lfc_SiteMiddle_Sitehigh_road")) %>% 
  pivot_longer(!taxon, names_to = "pair", values_to = "mean")

sum.d <- data.frame(taxon = names(sum), sum = sum, row.names = NULL)

lfc <- ancom$res_pair %>% 
  select(c("taxon","lfc_Sitehigh_road", "lfc_SiteMiddle", "lfc_SiteMiddle_Sitehigh_road"))

data_for_plot <- ancom$res_pair %>% 
  select(c("taxon","lfc_Sitehigh_road", "lfc_SiteMiddle", "lfc_SiteMiddle_Sitehigh_road")) %>% 
  pivot_longer(c("lfc_Sitehigh_road", "lfc_SiteMiddle", "lfc_SiteMiddle_Sitehigh_road"), names_to = "pair") %>% 
  left_join(mean.d, by=c("taxon", "pair"))

data_for_plot
## # A tibble: 1,596 × 4
##    taxon pair                           value  mean
##    <chr> <chr>                          <dbl> <dbl>
##  1 Seq1  lfc_Sitehigh_road            -2.52    99.6
##  2 Seq1  lfc_SiteMiddle               -3.31    89.9
##  3 Seq1  lfc_SiteMiddle_Sitehigh_road -0.788  108. 
##  4 Seq2  lfc_Sitehigh_road             0.681   88.3
##  5 Seq2  lfc_SiteMiddle                0.664   80.3
##  6 Seq2  lfc_SiteMiddle_Sitehigh_road -0.0171 109. 
##  7 Seq3  lfc_Sitehigh_road            -2.65    88.5
##  8 Seq3  lfc_SiteMiddle               -2.20    84.0
##  9 Seq3  lfc_SiteMiddle_Sitehigh_road  0.459  107. 
## 10 Seq5  lfc_Sitehigh_road            -1.77    84.4
## # … with 1,586 more rows
data_for_plot %>% 
  ggplot(aes(x=mean, y=value, label=taxon, color=pair)) + 
  geom_point() +
  geom_smooth(aes(y=value, x=mean, fill=pair, color=pair),
              method=lm, 
              method.args = list(family = "quasipoisson"), 
              se=TRUE) +
  theme_bw() +
  facet_wrap(~pair)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: In lm.wfit(x, y, w, offset = offset, singular.ok = singular.ok, 
##     ...) :
##  extra argument 'family' will be disregarded
## Warning: The following aesthetics were dropped during statistical transformation: label
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## Warning: In lm.wfit(x, y, w, offset = offset, singular.ok = singular.ok, 
##     ...) :
##  extra argument 'family' will be disregarded
## Warning: The following aesthetics were dropped during statistical transformation: label
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## Warning: In lm.wfit(x, y, w, offset = offset, singular.ok = singular.ok, 
##     ...) :
##  extra argument 'family' will be disregarded
## Warning: The following aesthetics were dropped during statistical transformation: label
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

l <- lm(value ~ mean*pair, data=data_for_plot) 
summary(l)
## 
## Call:
## lm(formula = value ~ mean * pair, data = data_for_plot)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.2166 -1.2223  0.0452  1.3728  7.6588 
## 
## Coefficients:
##                                        Estimate Std. Error t value     Pr(>|t|)
## (Intercept)                            1.347724   0.401319   3.358     0.000803
## mean                                  -0.042755   0.007622  -5.609 0.0000000239
## pairlfc_SiteMiddle                    -0.188361   0.531089  -0.355     0.722885
## pairlfc_SiteMiddle_Sitehigh_road      -1.592645   0.530068  -3.005     0.002701
## mean:pairlfc_SiteMiddle               -0.010654   0.010575  -1.007     0.313867
## mean:pairlfc_SiteMiddle_Sitehigh_road  0.039059   0.009855   3.963 0.0000771604
##                                          
## (Intercept)                           ***
## mean                                  ***
## pairlfc_SiteMiddle                       
## pairlfc_SiteMiddle_Sitehigh_road      ** 
## mean:pairlfc_SiteMiddle                  
## mean:pairlfc_SiteMiddle_Sitehigh_road ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.084 on 1590 degrees of freedom
## Multiple R-squared:  0.07491,    Adjusted R-squared:  0.072 
## F-statistic: 25.75 on 5 and 1590 DF,  p-value: < 2.2e-16